# Package setup: PCA helpers (factoextra, ggfortify), plotting (ggplot2),
# piping (magrittr), and correlation visualisation (corrplot).
library(graphics)  # was require(); library() errors loudly if the package is missing
library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(corrplot)
## corrplot 0.92 loaded
library(magrittr)
library(ggfortify)
# EWCS 2016 survey extract; the value -999 encodes a missing answer.
ewcs <- read.table("EWCS_2016.csv", sep = ",", header = TRUE)
# Recode the -999 sentinel to NA (simplified from the original
# `ewcs[,][ewcs[, ,] == -999] <- NA`, which indexed the frame twice
# with a stray extra comma).
ewcs[ewcs == -999] <- NA
# Keep only rows with no missing values.
kk <- complete.cases(ewcs)
ewcs <- ewcs[kk, ]
# PCA on all 11 EWCS variables, standardised (center + scale) so items on
# different response scales contribute comparably.
ewcs_summary <- prcomp(ewcs,center = TRUE,scale = TRUE)
print(ewcs_summary)
## Standard deviations (1, .., p=11):
## [1] 2.0984525 1.1873661 1.0105780 0.9706087 0.8831901 0.7496949 0.7129856
## [8] 0.6520052 0.5956642 0.5649903 0.5232145
##
## Rotation (n x k) = (11 x 11):
## PC1 PC2 PC3 PC4 PC5 PC6
## Q2a 0.03203956 -0.1386327 0.796373784 0.57638908 -6.171166e-02 0.01266193
## Q2b 0.07652230 -0.2204528 -0.584133839 0.76073419 7.105450e-02 0.00515712
## Q87a 0.39103574 -0.1996019 -0.038763673 -0.07849823 -3.148653e-02 0.02786038
## Q87b 0.37759153 -0.2359578 0.077079602 -0.16741716 -4.488656e-02 0.08133873
## Q87c 0.39652146 -0.2056496 -0.004550283 -0.03679735 -1.796326e-02 0.05172394
## Q87d 0.37141006 -0.2534245 0.062704331 -0.09378305 -5.747547e-05 0.14878121
## Q87e 0.36263461 -0.1259478 -0.059239797 -0.08174241 -3.299479e-02 -0.14466435
## Q90a 0.33784962 0.3007859 0.002609147 0.12630062 1.210966e-01 -0.20735048
## Q90b 0.27485090 0.4436706 0.054692725 0.05645151 2.715430e-01 -0.62004729
## Q90c 0.22363116 0.5038874 0.015633656 0.08498139 3.729994e-01 0.71576928
## Q90f 0.17680118 0.4160141 -0.080175825 0.10806045 -8.713190e-01 0.08308652
## PC7 PC8 PC9 PC10 PC11
## Q2a -0.092896333 0.01060833 -0.02186704 0.005570217 -0.009937025
## Q2b 0.005490225 -0.12077633 0.01372336 0.070150927 0.028533515
## Q87a -0.179630910 -0.07249015 -0.62888270 -0.261821916 -0.544290070
## Q87b 0.158131884 -0.36972267 -0.28416331 0.574242537 0.432370395
## Q87c 0.097909301 0.16394260 0.07090937 -0.668278083 0.554994648
## Q87d 0.360503139 -0.19974893 0.62699603 0.013748398 -0.446981468
## Q87e -0.712223779 0.37711846 0.29943899 0.284447504 0.019249723
## Q90a 0.495518337 0.62792742 -0.15661941 0.226314662 -0.078670047
## Q90b -0.100309447 -0.47561321 0.09433706 -0.131861974 0.026155642
## Q90c -0.177731352 -0.06749933 0.01374979 0.001025211 0.028835352
## Q90f -0.008177733 -0.09643027 0.04070890 -0.020991855 -0.002154017
# Scree plot of all 11 components.
# FIX: screeplot()'s documented types are "barplot" and "lines"; the original
# "line" only worked through partial argument matching, and the rest of this
# file already uses "lines".
screeplot(ewcs_summary, main = "Screeplot of ewcs", col = "steelblue", type = "lines", pch = 1, npcs = length(ewcs_summary$sdev))
summary(ewcs_summary)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0985 1.1874 1.01058 0.97061 0.88319 0.74969 0.71299
## Proportion of Variance 0.4003 0.1282 0.09284 0.08564 0.07091 0.05109 0.04621
## Cumulative Proportion 0.4003 0.5285 0.62133 0.70697 0.77788 0.82898 0.87519
## PC8 PC9 PC10 PC11
## Standard deviation 0.65201 0.59566 0.56499 0.52321
## Proportion of Variance 0.03865 0.03226 0.02902 0.02489
## Cumulative Proportion 0.91384 0.94609 0.97511 1.00000
# Observations and variable loadings on the first two components.
biplot(ewcs_summary)
# Give the survey items readable names: L_* are the life-satisfaction items
# (Q87a-e), W_* the work-satisfaction items (Q90a/b/c/f).
names(ewcs) <- c("Gender", "Age",
                 "L_Cheerful", "L_Calm", "L_Active", "L_WakeUpFreshed", "L_Interesting",
                 "W_Energetic", "W_Enthusiastic", "W_TimeFlies", "W_Expert")
# Split out the item groups for separate PCAs below.
LifeWork_satisfaction <- ewcs[, 3:11]
life_satisfaction <- LifeWork_satisfaction[, 1:5]
work_satisfaction <- LifeWork_satisfaction[, 6:9]
# Per-item variances (vapply gives the same named numeric vector as
# apply(df, 2, var) without the matrix coercion).
vapply(life_satisfaction, var, numeric(1))
## L_Cheerful L_Calm L_Active L_WakeUpFreshed L_Interesting
## 1.228888 1.494328 1.311350 1.636771 1.411519
vapply(work_satisfaction, var, numeric(1))
## W_Energetic W_Enthusiastic W_TimeFlies W_Expert
## 0.7167108 1.0269436 0.9390320 0.4536516
# PCA on the nine satisfaction items only (Gender/Age excluded), standardised.
LifeWork_satisfaction_summary <- prcomp(LifeWork_satisfaction, center = TRUE,scale = TRUE)
print(LifeWork_satisfaction_summary)
## Standard deviations (1, .., p=9):
## [1] 2.0928204 1.1737194 0.8844224 0.7497501 0.7159772 0.6585330 0.5960200
## [8] 0.5676965 0.5237831
##
## Rotation (n x k) = (9 x 9):
## PC1 PC2 PC3 PC4 PC5
## L_Cheerful -0.3913935 0.2145874 -0.03657888 0.03076865 -0.18884134
## L_Calm -0.3791006 0.2659415 -0.02985531 0.08182211 0.16291474
## L_Active -0.3965664 0.2153921 -0.01930540 0.05021775 0.09626028
## L_WakeUpFreshed -0.3718983 0.2719571 0.01071816 0.14547235 0.36815180
## L_Interesting -0.3634299 0.1412722 -0.04224498 -0.13683679 -0.72104140
## W_Energetic -0.3392943 -0.3131456 0.11803397 -0.21376803 0.48446971
## W_Enthusiastic -0.2777600 -0.4467453 0.27766299 -0.61876652 -0.09789550
## W_TimeFlies -0.2263318 -0.5125126 0.37107470 0.71737559 -0.16969792
## W_Expert -0.1787233 -0.4273225 -0.87565589 0.08218019 0.00185347
## PC6 PC7 PC8 PC9
## L_Cheerful 0.04881360 0.63571903 0.266498682 0.534111082
## L_Calm 0.40807759 0.27941113 -0.578403554 -0.415068521
## L_Active -0.18414840 -0.06535586 0.644941326 -0.574499260
## L_WakeUpFreshed 0.20550712 -0.61946594 0.013531916 0.452508421
## L_Interesting -0.34425995 -0.31473490 -0.299450773 -0.017535605
## W_Energetic -0.63313474 0.14717915 -0.252825622 0.077701110
## W_Enthusiastic 0.47109683 -0.08517536 0.154921357 -0.022979738
## W_TimeFlies 0.06334863 -0.01398893 0.004831339 -0.027406265
## W_Expert 0.09811384 -0.04013732 0.026073459 0.002463957
# Scree plot to judge how many components to retain.
screeplot(LifeWork_satisfaction_summary, main = "Screeplot of life and work satisfaction summary", col = "steelblue", type = "lines", pch = 1, npcs = length(LifeWork_satisfaction_summary$sdev))
# Per the output below, PC1+PC2 account for ~64% of the variance.
summary(LifeWork_satisfaction_summary)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0928 1.1737 0.88442 0.74975 0.71598 0.65853 0.59602
## Proportion of Variance 0.4867 0.1531 0.08691 0.06246 0.05696 0.04819 0.03947
## Cumulative Proportion 0.4867 0.6397 0.72664 0.78909 0.84605 0.89424 0.93371
## PC8 PC9
## Standard deviation 0.56770 0.52378
## Proportion of Variance 0.03581 0.03048
## Cumulative Proportion 0.96952 1.00000
# PCA on the five life-satisfaction items alone.
life_satisfaction_summary <- prcomp(life_satisfaction, center = TRUE,scale = TRUE)
print(life_satisfaction_summary)
## Standard deviations (1, .., p=5):
## [1] 1.8785419 0.7090903 0.5996132 0.5771453 0.5250130
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4 PC5
## L_Cheerful -0.4584578 0.13368851 -0.5935702 -0.3479363 0.54640511
## L_Calm -0.4524134 -0.33233736 -0.4477102 0.5371680 -0.44258351
## L_Active -0.4616912 -0.05692524 0.2408944 -0.6634512 -0.53423133
## L_WakeUpFreshed -0.4443103 -0.48734107 0.5605755 0.1774021 0.46837026
## L_Interesting -0.4178135 0.79431572 0.2737792 0.3446039 -0.02806228
screeplot(life_satisfaction_summary, main = "Screeplot of life satisfaction summary", col = "steelblue", type = "lines", pch = 1, npcs = length(life_satisfaction_summary$sdev))
# A single component captures ~71% of the variance in the life items.
summary(life_satisfaction_summary) # PC1, 71%
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.8785 0.7091 0.59961 0.57715 0.52501
## Proportion of Variance 0.7058 0.1006 0.07191 0.06662 0.05513
## Cumulative Proportion 0.7058 0.8064 0.87825 0.94487 1.00000
# PCA on the four work-satisfaction items alone.
work_satisfaction_summary <- prcomp(work_satisfaction,center = TRUE, scale = TRUE)
print(work_satisfaction_summary)
## Standard deviations (1, .., p=4):
## [1] 1.4725145 0.8866447 0.7632521 0.6804472
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## W_Energetic 0.5485744 -0.1317048 0.39025603 -0.727612648
## W_Enthusiastic 0.5381708 -0.2577844 0.42525792 0.680496212
## W_TimeFlies 0.5017631 -0.2990783 -0.81165116 -0.002896211
## W_Expert 0.3970795 0.9092597 -0.08987946 0.086581471
screeplot(work_satisfaction_summary, main = "Screeplot of work satisfaction summary", col = "steelblue", type = "lines", pch = 1, npcs = length(work_satisfaction_summary$sdev))
# Two components capture ~74% of the variance in the work items.
summary(work_satisfaction_summary)# PC2, 74%
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.4725 0.8866 0.7633 0.6804
## Proportion of Variance 0.5421 0.1965 0.1456 0.1158
## Cumulative Proportion 0.5421 0.7386 0.8842 1.0000
# Pairwise correlations among all 11 renamed variables, displayed as numbers.
ewcs_cor <- cor(ewcs)
corrplot(ewcs_cor,main = "ewcs correlation matrix",method="number",number.cex = 0.5,tl.cex = 0.5,cex.main = 0.5)
# From the correlation matrix we can see that Gender and Age have very low or no
# correlation with each other and with the life and work satisfaction variables.
# Furthermore, the life satisfaction variables (cheerful, calm, active,
# wake-up-freshed, interesting) and the work satisfaction variables (energetic,
# enthusiastic, time-flies, expert) are each correlated among themselves.
# Collapse the nine items into two composite scores, "Life" and "Work",
# by summing the items in each group, then keep only Gender, Age and the
# two composites.
ewcs_sum <- ewcs
ewcs_sum$Life <- with(ewcs_sum,
                      L_Cheerful + L_Calm + L_Active + L_WakeUpFreshed + L_Interesting)
ewcs_sum$Work <- with(ewcs_sum,
                      W_Energetic + W_Enthusiastic + W_TimeFlies + W_Expert)
ewcs_sum <- ewcs_sum[, -(3:11)]
# Correlations among Gender, Age and the two composite scores.
ewcs_sum_cor <- cor(ewcs_sum)
corrplot(ewcs_sum_cor,main = "adjusted ewcs correlation matrix",method="number",tl.cex = 0.8,cex.main = 0.5)
# Recode Gender to a factor for plotting.
# NOTE(review): assumes codes > 1 mean "Female" and 1 means "Male" — confirm
# against the EWCS Q2a codebook.
ewcs$Gender <- factor(ifelse(ewcs$Gender > 1, "Female", "Male"))
# PCA on Age + the nine satisfaction items (Gender excluded now that it is a factor).
ewcs1<- prcomp(ewcs[,2:11],center = TRUE,scale = TRUE)
print(ewcs1)
## Standard deviations (1, .., p=10):
## [1] 2.0976195 1.1839966 0.9845291 0.8836492 0.7497450 0.7159253 0.6520523
## [8] 0.5959147 0.5650083 0.5232812
##
## Rotation (n x k) = (10 x 10):
## PC1 PC2 PC3 PC4 PC5
## Age -0.07673934 0.2341908 -0.95486168 0.0819185588 -0.004104825
## L_Cheerful -0.39125911 0.2059654 0.04058376 -0.0374821542 -0.030975693
## L_Calm -0.37753362 0.2372082 0.18426095 -0.0479502675 -0.080092914
## L_Active -0.39650599 0.2079502 0.03072206 -0.0185781809 -0.050666936
## L_WakeUpFreshed -0.37122407 0.2530487 0.11925417 0.0009277797 -0.144603686
## L_Interesting -0.36298615 0.1336218 0.02745223 -0.0419885224 0.136164504
## W_Energetic -0.33809406 -0.3038659 -0.09949849 0.1280184143 0.212655275
## W_Enthusiastic -0.27520815 -0.4478020 -0.01502261 0.2750553376 0.619425132
## W_TimeFlies -0.22412564 -0.5062590 -0.06314383 0.3741688674 -0.717620023
## W_Expert -0.17729376 -0.4149388 -0.15036032 -0.8691545718 -0.081868116
## PC6 PC7 PC8 PC9 PC10
## Age 0.0118456707 -0.12125295 0.01425527 -0.069989462 0.028862740
## L_Cheerful -0.1883237393 -0.07180931 -0.62892995 0.262790811 -0.543368024
## L_Calm 0.1562341870 -0.36796703 -0.28702047 -0.573755277 0.432608037
## L_Active 0.0987383678 0.16406236 0.07268251 0.668415943 0.554035410
## L_WakeUpFreshed 0.3653977046 -0.19854186 0.62304469 -0.016138100 -0.449051138
## L_Interesting -0.7189684620 0.37294108 0.30360855 -0.284146599 0.020265834
## W_Energetic 0.4906423044 0.63108858 -0.15916957 -0.226472152 -0.078865483
## W_Enthusiastic -0.1021324008 -0.47627868 0.09430757 0.131642362 0.025868735
## W_TimeFlies -0.1692421817 -0.06895426 0.01549524 -0.000659684 0.029383308
## W_Expert 0.0007188292 -0.09722227 0.04181626 0.021165146 -0.001794592
# FIX: use the documented screeplot type "lines" instead of relying on partial
# matching of "line" (consistent with the other screeplot() calls in this file).
screeplot(ewcs1, main = "Screeplot of ewcs1", col = "steelblue", type = "lines", pch = 1, npcs = length(ewcs1$sdev))
summary(ewcs1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.098 1.1840 0.98453 0.88365 0.74975 0.71593 0.65205
## Proportion of Variance 0.440 0.1402 0.09693 0.07808 0.05621 0.05125 0.04252
## Cumulative Proportion 0.440 0.5802 0.67712 0.75520 0.81141 0.86267 0.90518
## PC8 PC9 PC10
## Standard deviation 0.59591 0.56501 0.52328
## Proportion of Variance 0.03551 0.03192 0.02738
## Cumulative Proportion 0.94069 0.97262 1.00000
# PC1/PC2 biplot with observations coloured by gender and variable loadings drawn.
autoplot(ewcs1, data = ewcs, colour = 'Gender',
label = FALSE,
loadings = TRUE, loadings.colour = 'steelblue',
loadings.label = TRUE,
loadings.label.size = 4,
loadings.label.colour = "black",
loadings.label.repel=T) +
theme(legend.text = element_text(size = 8),
legend.title = element_text(size = 10),
axis.title = element_text(size = 8))
# PCA on Age plus the two composite scores (Life, Work).
ewcs2<- prcomp(ewcs_sum[,2:4],center = TRUE,scale = TRUE)
print(ewcs2)
## Standard deviations (1, .., p=3):
## [1] 1.2280820 0.9918849 0.7127264
##
## Rotation (n x k) = (3 x 3):
## PC1 PC2 PC3
## Age -0.2357538 0.95767696 -0.1651514
## Life -0.7005319 -0.04968643 0.7118892
## Work -0.6735541 -0.28352442 -0.6825971
# FIX: use the documented screeplot type "lines" instead of relying on partial
# matching of "line" (consistent with the other screeplot() calls in this file).
screeplot(ewcs2, main = "Screeplot of ewcs2", col = "steelblue", type = "lines", pch = 1, npcs = length(ewcs2$sdev))
summary(ewcs2)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.2281 0.9919 0.7127
## Proportion of Variance 0.5027 0.3280 0.1693
## Cumulative Proportion 0.5027 0.8307 1.0000
# PC1/PC2 biplot of the composite-score PCA, coloured by gender.
autoplot(ewcs2, data = ewcs, colour = 'Gender',
label = FALSE,
loadings = TRUE, loadings.colour = 'steelblue',
loadings.label = TRUE,
loadings.label.size = 4,
loadings.label.colour = "black",
loadings.label.repel=T) +
theme(legend.text = element_text(size = 8),
legend.title = element_text(size = 10),
axis.title = element_text(size = 8))
# Packages for the regression part: data wrangling (dplyr), model training and
# resampling (caret), train/test splitting (caTools), random forests, penalised
# regression (glmnet), VIF diagnostics (car), plotting helpers (ggpubr), and
# skewness (e1071).
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(caTools)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-3
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(ggpubr)
library(e1071)
# UCI student-performance data: one file per course.
school1=read.table("student-mat.csv",sep=";",header=TRUE) # Mathematics
school2=read.table("student-por.csv",sep=";",header=TRUE) # Portuguese
# Students taking both courses, matched on shared demographics;
# merge suffixes: .x = mathematics, .y = Portuguese.
schools=merge(school1, school2, by=c("school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet"))
# Average final grade (G3) by sex for each course.
schools_sex <- schools %>%
group_by(sex) %>%
summarise(Average_G3_mat = mean(G3.x),Average_G3_por = mean(G3.y))
# FIX: geom_col() is the idiomatic form of geom_bar(stat = "identity"), and the
# constant aesthetics (fontface, vjust) belong outside aes() — inside aes()
# they are treated as mapped data values rather than fixed settings.
ggplot(data = schools_sex, mapping = aes(x = sex, y = Average_G3_mat)) +
geom_col(fill="steelblue") +
geom_text(mapping = aes(label = round(Average_G3_mat,0)), fontface = 'bold', vjust = -0.2, size = 2) +
labs(title = "Average mat G3 ",x = "Gender", y ="Average G3")+
theme(plot.title = element_text(hjust = 0.5))
ggplot(data = schools_sex, mapping = aes(x = sex, y = Average_G3_por)) +
geom_col(fill="steelblue") +
geom_text(mapping = aes(label = round(Average_G3_por,0)), fontface = 'bold', vjust = -0.2, size = 2) +
labs(title = "Average por G3",x = "Gender", y ="Average G3")+
theme(plot.title = element_text(hjust = 0.5))
# Drop the intermediate grades so models predict final grade G3 from
# background attributes only.
drop <- c("G1.x","G2.x","G1.y","G2.y")
schools_G3 = schools[,!(names(schools) %in% drop)]
# Course-specific frames: shared columns plus each course's own columns.
mat <- schools_G3[,1:31]
por <- schools_G3[, -c(14:31)]
# Correlations among the numeric attributes of each course.
mat_numeric <- select_if(mat, is.numeric)
mat_numeric_cor <-cor(mat_numeric)
corrplot(mat_numeric_cor,main = "Correlation matrix of mat", method="number",number.cex = 0.5,tl.cex = 0.5,cex.main = 0.5)
por_numeric <- select_if(por, is.numeric)
por_numeric_por <-cor(por_numeric)
corrplot(por_numeric_por, main = "Correlation matrix of por",method="number",number.cex = 0.5,tl.cex = 0.5,cex.main = 0.5)
# Distribution of the response in each course.
hist(mat$G3.x)
hist(por$G3.y)
# Reproducible 70/30 train/validation split, stratified on the response.
set.seed(1)
mat_training_sample <- createDataPartition(mat$G3.x,p = 0.7, list = FALSE)
por_training_sample <- createDataPartition(por$G3.y,p = 0.7, list = FALSE)
mat_dataset<- mat[mat_training_sample,]
por_dataset<- por[por_training_sample,]
mat_validation <- mat[-mat_training_sample,]
por_validation <- por[-por_training_sample,]
# Column 31 is assumed to be the response (G3) and columns 1:30 the
# predictors in both frames — TODO confirm the column order after the merge.
mat_dataset_output <- mat_dataset[,31]
por_dataset_output <- por_dataset[,31]
mat_dataset_input <- mat_dataset[,1:30]
por_dataset_input <- por_dataset[,1:30]
mat_validation_output <- mat_validation[,31]
por_validation_output <- por_validation[,31]
mat_validation_input <- mat_validation[,1:30]
por_validation_input <- por_validation[,1:30]
# Baseline multiple linear regression of final grade on all predictors.
set.seed(1)
mat_lm <- lm(G3.x ~ ., data = mat_dataset)
por_lm <- lm(G3.y ~ ., data = por_dataset)
# Training RMSE of each fit.
sqrt(mean(residuals(mat_lm)^2)) # 3.730919
## [1] 3.730919
sqrt(mean(residuals(por_lm)^2)) # 2.110835
## [1] 2.110835
# Training R-squared of each fit.
summary(mat_lm)$r.squared # 0.3454868
## [1] 0.3454868
summary(por_lm)$r.squared # 0.4543227
## [1] 0.4543227
# Full coefficient table for the mathematics model.
summary(mat_lm)
##
## Call:
## lm(formula = G3.x ~ ., data = mat_dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.1766 -1.7096 0.1935 2.5293 9.9706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.92562 5.44437 4.211 3.66e-05 ***
## schoolMS 0.47542 1.03920 0.457 0.64775
## sexM 1.01717 0.64621 1.574 0.11686
## age -0.74518 0.26058 -2.860 0.00463 **
## addressU 0.33932 0.72129 0.470 0.63849
## famsizeLE3 0.69009 0.59552 1.159 0.24774
## PstatusT -0.82357 0.99182 -0.830 0.40720
## Medu 0.43632 0.40315 1.082 0.28027
## Fedu 0.08168 0.34416 0.237 0.81262
## Mjobhealth 1.10784 1.37531 0.806 0.42135
## Mjobother 0.08748 0.85546 0.102 0.91864
## Mjobservices 1.37611 1.01273 1.359 0.17554
## Mjobteacher -0.91696 1.24123 -0.739 0.46081
## Fjobhealth -2.50385 2.00255 -1.250 0.21245
## Fjobother -3.81266 1.59727 -2.387 0.01780 *
## Fjobservices -3.51258 1.65894 -2.117 0.03531 *
## Fjobteacher -0.91214 1.87950 -0.485 0.62792
## reasonhome 0.71092 0.67424 1.054 0.29281
## reasonother 0.61169 1.02420 0.597 0.55094
## reasonreputation 0.76876 0.70535 1.090 0.27690
## nurseryyes -0.79165 0.68267 -1.160 0.24741
## internetyes 0.67127 0.80404 0.835 0.40467
## guardian.xmother -0.34256 0.66013 -0.519 0.60431
## guardian.xother -1.79553 1.36644 -1.314 0.19015
## traveltime.x 0.18867 0.42965 0.439 0.66099
## studytime.x 0.30878 0.33417 0.924 0.35645
## failures.x -1.35897 0.43746 -3.107 0.00213 **
## schoolsup.xyes -1.24401 0.86278 -1.442 0.15071
## famsup.xyes -0.08967 0.59452 -0.151 0.88025
## paid.xyes -0.46801 0.59184 -0.791 0.42990
## activities.xyes -0.32750 0.54301 -0.603 0.54702
## higher.xyes 1.53205 1.36814 1.120 0.26397
## romantic.xyes -1.17527 0.58034 -2.025 0.04401 *
## famrel.x 0.53899 0.28361 1.900 0.05863 .
## freetime.x 0.10248 0.28727 0.357 0.72161
## goout.x -0.45759 0.27966 -1.636 0.10316
## Dalc.x -0.36279 0.38615 -0.940 0.34846
## Walc.x 0.34907 0.30155 1.158 0.24823
## health.x -0.30332 0.20684 -1.466 0.14389
## absences.x 0.02596 0.03438 0.755 0.45105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.044 on 229 degrees of freedom
## Multiple R-squared: 0.3455, Adjusted R-squared: 0.234
## F-statistic: 3.099 on 39 and 229 DF, p-value: 6.764e-08
# Full coefficient table for the Portuguese model.
summary(por_lm)
##
## Call:
## lm(formula = G3.y ~ ., data = por_dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0148 -1.2765 -0.0597 1.3178 5.9530
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.40000 3.00086 3.132 0.00196 **
## schoolMS -1.01443 0.62462 -1.624 0.10573
## sexM -1.05899 0.34360 -3.082 0.00231 **
## age 0.14777 0.14546 1.016 0.31078
## addressU 0.80174 0.38183 2.100 0.03685 *
## famsizeLE3 0.08196 0.34279 0.239 0.81124
## PstatusT -0.42597 0.49596 -0.859 0.39130
## Medu 0.20985 0.23365 0.898 0.37006
## Fedu 0.05706 0.18949 0.301 0.76358
## Mjobhealth 0.15263 0.78145 0.195 0.84532
## Mjobother -0.68465 0.51325 -1.334 0.18355
## Mjobservices -0.18854 0.58094 -0.325 0.74582
## Mjobteacher -0.13162 0.71286 -0.185 0.85368
## Fjobhealth -2.20569 1.21228 -1.819 0.07015 .
## Fjobother -1.40903 0.96510 -1.460 0.14566
## Fjobservices -1.59360 0.99129 -1.608 0.10930
## Fjobteacher -0.11762 1.06657 -0.110 0.91229
## reasonhome 0.31908 0.38530 0.828 0.40846
## reasonother -0.36003 0.56322 -0.639 0.52331
## reasonreputation 0.57797 0.39999 1.445 0.14983
## nurseryyes -0.08931 0.40170 -0.222 0.82426
## internetyes 0.19365 0.43459 0.446 0.65632
## guardian.ymother -0.21374 0.39127 -0.546 0.58541
## guardian.yother -0.83864 0.83423 -1.005 0.31582
## traveltime.y -0.11330 0.23764 -0.477 0.63399
## studytime.y 0.57266 0.20300 2.821 0.00521 **
## failures.y -0.48809 0.32240 -1.514 0.13143
## schoolsup.yyes -1.89600 0.47036 -4.031 7.57e-05 ***
## famsup.yyes 0.21173 0.32344 0.655 0.51336
## paid.yyes -1.07735 0.67978 -1.585 0.11438
## activities.yyes 0.26760 0.31417 0.852 0.39524
## higher.yyes 2.71325 0.72702 3.732 0.00024 ***
## romantic.yyes -0.23010 0.32636 -0.705 0.48149
## famrel.y 0.10580 0.16613 0.637 0.52487
## freetime.y 0.02260 0.16476 0.137 0.89101
## goout.y -0.36911 0.15231 -2.423 0.01615 *
## Dalc.y 0.14267 0.24358 0.586 0.55864
## Walc.y -0.02014 0.17231 -0.117 0.90706
## health.y -0.22679 0.10659 -2.128 0.03443 *
## absences.y -0.05787 0.02998 -1.930 0.05479 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.288 on 229 degrees of freedom
## Multiple R-squared: 0.4543, Adjusted R-squared: 0.3614
## F-statistic: 4.889 on 39 and 229 DF, p-value: 8.432e-15
# NOTE(review): this refits a *fresh* lm on the held-out 30% rather than
# scoring the training model with predict(mat_lm, mat_validation), so the
# "Test" RMSE/R2 below measure in-sample fit of a second model, not
# generalisation of the first — consider revising.
set.seed(1)
mat_lm_validate <- lm(G3.x~.,mat_validation)
por_lm_validate <- lm(G3.y~.,por_validation)
sqrt(mean(mat_lm_validate$residuals^2))
## [1] 3.341797
sqrt(mean(por_lm_validate$residuals^2))
## [1] 2.00442
summary(mat_lm_validate)$r.squared
## [1] 0.5227731
summary(por_lm_validate)$r.squared
## [1] 0.5901086
# Linear regression results:
# Train  RMSE: 3.730919 (mat), 2.110835 (por)
#        R2:   0.3454868 (mat), 0.4543227 (por)
# Test   RMSE: 3.341797 (mat), 2.00442 (por)
#        R2:   0.5227731 (mat), 0.5901086 (por)
# Standard lm diagnostic panels (residuals vs fitted, Q-Q, scale-location,
# residuals vs leverage) in a 2x2 grid for each model.
par(mfrow=c(2,2))
plot(mat_lm, main="mat")
plot(por_lm, main="por")
# No distinctive pattern appears in the residuals vs. fitted plot.
# The normal Q-Q plot suggests the residual distribution is heavy-tailed.
# The scale-location plot indicates homoscedasticity.
# In residuals vs. leverage, Cook's distance lines are not visible, so there
# appear to be no influential cases.
# Besides the Q-Q plot, normality can also be checked with a density plot and
# a formal normality test.
# Density plots of the raw responses.
ggdensity(mat$G3.x)+
labs(title = "Density plot of G3.x",x = "G3", y ="Density")
ggdensity(por$G3.y)+
labs(title = "Density plot of G3.y",x = "G3", y ="Density")
# Skewness of the responses (e1071); both are left-skewed.
skewness(mat$G3.x) #-0.70 = moderately skewed
## [1] -0.7003219
skewness(por$G3.y) #-0.99 = moderately skewed
## [1] -0.9891237
# Shapiro-Wilk tests on the model residuals; small p-values below reject
# normality for both fits.
shapiro.test(mat_lm$residuals)
##
## Shapiro-Wilk normality test
##
## data: mat_lm$residuals
## W = 0.97212, p-value = 4.157e-05
shapiro.test(por_lm$residuals)
##
## Shapiro-Wilk normality test
##
## data: por_lm$residuals
## W = 0.97997, p-value = 0.0007823
# Generalised variance inflation factors (car::vif) to screen for
# multicollinearity among the predictors of each model.
vif(mat_lm)
## GVIF Df GVIF^(1/(2*Df))
## school 1.551232 1 1.245485
## sex 1.710610 1 1.307903
## age 1.563639 1 1.250456
## address 1.410864 1 1.187798
## famsize 1.153658 1 1.074085
## Pstatus 1.215314 1 1.102413
## Medu 3.293445 1 1.814785
## Fedu 2.326553 1 1.525304
## Mjob 4.463926 4 1.205632
## Fjob 2.786882 4 1.136684
## reason 1.874316 3 1.110386
## nursery 1.195593 1 1.093432
## internet 1.401311 1 1.183770
## guardian.x 1.488469 2 1.104549
## traveltime.x 1.474145 1 1.214144
## studytime.x 1.433243 1 1.197181
## failures.x 1.654708 1 1.286354
## schoolsup.x 1.213449 1 1.101567
## famsup.x 1.363518 1 1.167698
## paid.x 1.437258 1 1.198857
## activities.x 1.208949 1 1.099522
## higher.x 1.519253 1 1.232580
## romantic.x 1.205068 1 1.097756
## famrel.x 1.198606 1 1.094809
## freetime.x 1.408967 1 1.186999
## goout.x 1.617268 1 1.271718
## Dalc.x 2.181761 1 1.477079
## Walc.x 2.557493 1 1.599216
## health.x 1.333537 1 1.154789
## absences.x 1.350365 1 1.162052
vif(por_lm)
## GVIF Df GVIF^(1/(2*Df))
## school 1.505811 1 1.227115
## sex 1.516969 1 1.231653
## age 1.579293 1 1.256699
## address 1.358625 1 1.165601
## famsize 1.183824 1 1.088037
## Pstatus 1.141530 1 1.068424
## Medu 3.241728 1 1.800480
## Fedu 2.319684 1 1.523051
## Mjob 3.902512 4 1.185545
## Fjob 2.667473 4 1.130479
## reason 1.818070 3 1.104761
## nursery 1.195824 1 1.093537
## internet 1.352218 1 1.162849
## guardian.y 1.432148 2 1.093949
## traveltime.y 1.360668 1 1.166477
## studytime.y 1.468732 1 1.211912
## failures.y 1.478488 1 1.215931
## schoolsup.y 1.286977 1 1.134450
## famsup.y 1.260747 1 1.122830
## paid.y 1.171723 1 1.082461
## activities.y 1.266808 1 1.125526
## higher.y 1.430342 1 1.195969
## romantic.y 1.175615 1 1.084258
## famrel.y 1.138642 1 1.067072
## freetime.y 1.352646 1 1.163033
## goout.y 1.598642 1 1.264374
## Dalc.y 2.326425 1 1.525262
## Walc.y 2.663851 1 1.632131
## health.y 1.193808 1 1.092615
## absences.y 1.287451 1 1.134659
# Standardised residuals of the training fits, attached to the training data
# so the most extreme observations can be inspected.
mat_standard_residuals <-rstandard(mat_lm)
por_standard_residuals <-rstandard(por_lm)
# cbind() names the appended column after the vector, so the $ extraction
# below refers to that new column.
mat_lm_sr <- cbind(mat_dataset, mat_standard_residuals)
por_lm_sr <- cbind(por_dataset, por_standard_residuals)
# Largest positive / most negative standardised residuals for each model.
head(mat_lm_sr[order(-mat_standard_residuals),]$mat_standard_residuals)
## [1] 2.877031 1.885659 1.803399 1.796863 1.777056 1.762084
head(mat_lm_sr[order(mat_standard_residuals),]$mat_standard_residuals)
## [1] -3.011877 -2.994094 -2.713830 -2.672347 -2.619447 -2.618735
head(por_lm_sr[order(-por_standard_residuals),]$por_standard_residuals)
## [1] 2.759523 2.523463 2.420612 2.106098 2.035809 2.026526
head(por_lm_sr[order(por_standard_residuals),]$por_standard_residuals)
## [1] -4.288028 -3.966159 -3.121595 -2.286437 -1.949668 -1.754359
# Try a log transform of the response (+1 guards against log(0) since G3 can
# be zero); per the Shapiro-Wilk output below, this makes normality worse.
mat_lm_log <- lm(log(G3.x+1)~.,mat_dataset)
por_lm_log <- lm(log(G3.y+1)~.,por_dataset)
par(mfrow=c(2,2))
plot(mat_lm_log)
plot(por_lm_log)
shapiro.test(mat_lm_log$residuals)
##
## Shapiro-Wilk normality test
##
## data: mat_lm_log$residuals
## W = 0.84502, p-value = 1.009e-15
shapiro.test(por_lm_log$residuals)
##
## Shapiro-Wilk normality test
##
## data: por_lm_log$residuals
## W = 0.71013, p-value < 2.2e-16
# Try a square-root transform of the response.
mat_lm_sq <- lm(sqrt(G3.x)~.,mat_dataset)
por_lm_sq <- lm(sqrt(G3.y)~.,por_dataset)
par(mfrow=c(2,2))
plot(mat_lm_sq)
plot(por_lm_sq)
# BUG FIX: the original re-ran shapiro.test() on the *log* models' residuals
# (mat_lm_log / por_lm_log) here, so the sqrt models were never actually
# tested — the recorded W = 0.84502 / 0.71013 outputs were copies of the log
# results. Test the sqrt models' residuals instead.
shapiro.test(mat_lm_sq$residuals)
shapiro.test(por_lm_sq$residuals)
# Try a power transform: model G3^(3/2). Per the Shapiro-Wilk output below,
# this transform yields residuals consistent with normality (p > 0.05), so it
# is carried forward into the lasso models.
mat_lm_adjusted <- lm((G3.x)^(3/2)~.,mat_dataset)
por_lm_adjusted <- lm((G3.y)^(3/2)~.,por_dataset)
par(mfrow=c(2,2))
plot(mat_lm_adjusted, main= "mat adjusted")
plot(por_lm_adjusted, main= "por adjusted")
shapiro.test(mat_lm_adjusted$residuals)
##
## Shapiro-Wilk normality test
##
## data: mat_lm_adjusted$residuals
## W = 0.99496, p-value = 0.5243
shapiro.test(por_lm_adjusted$residuals)
##
## Shapiro-Wilk normality test
##
## data: por_lm_adjusted$residuals
## W = 0.99311, p-value = 0.251
# Design matrices for glmnet (dummy-encodes factors; -1 drops the intercept
# column since glmnet fits its own).
mat_dataset_input_matrix <- model.matrix(G3.x~.-1, mat_dataset)
por_dataset_input_matrix <- model.matrix(G3.y~.-1, por_dataset)
mat_validation_input_matrix <- model.matrix(G3.x~.-1, mat_validation)
por_validation_input_matrix <- model.matrix(G3.y~.-1, por_validation)
set.seed(1)
# Lasso (alpha = 1) on the power-transformed response G3^(3/2), with
# cross-validation to choose lambda.
mat_lasso <- glmnet(mat_dataset_input_matrix, mat_dataset_output^(3/2), alpha = 1)
por_lasso <- glmnet(por_dataset_input_matrix, por_dataset_output^(3/2), alpha = 1)
cv_mat <- cv.glmnet(mat_dataset_input_matrix, mat_dataset_output^(3/2), alpha = 1)
cv_por <- cv.glmnet(por_dataset_input_matrix, por_dataset_output^(3/2), alpha = 1)
# lambda.min = the lambda minimising cross-validated MSE.
best_cv_mat_lambda <- cv_mat$lambda.min
best_cv_mat_lambda
## [1] 1.125309
best_cv_por_lambda <- cv_por$lambda.min
best_cv_por_lambda
## [1] 0.5314042
# A lambda of 1.13 for mat and 0.53 for por minimises the cross-validated MSE.
# Plot the cross-validated MSE curves:
# Cross-validation curves (MSE vs log(lambda)) for both courses.
par(mfrow=c(1,1))
plot(cv_mat, main = "mat MSE test")
plot(cv_por, main = "por MSE test")
# Refit the maths lasso at the selected lambda; "." entries in the coefficient
# table below are predictors shrunk exactly to zero.
best_mat <- glmnet(mat_dataset_input_matrix, mat_dataset_output^(3/2), alpha = 1, lambda = best_cv_mat_lambda)
coef(best_mat)
## 41 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 80.6777834
## schoolGP .
## schoolMS .
## sexM 0.9090960
## age -2.8800057
## addressU .
## famsizeLE3 1.3185763
## PstatusT .
## Medu 1.2383058
## Fedu .
## Mjobhealth 1.9040452
## Mjobother .
## Mjobservices 4.0246620
## Mjobteacher .
## Fjobhealth .
## Fjobother -1.3558806
## Fjobservices .
## Fjobteacher 6.8210980
## reasonhome .
## reasonother .
## reasonreputation .
## nurseryyes .
## internetyes 1.4072077
## guardian.xmother .
## guardian.xother -1.0730092
## traveltime.x .
## studytime.x .
## failures.x -6.0049623
## schoolsup.xyes -3.8902483
## famsup.xyes .
## paid.xyes .
## activities.xyes .
## higher.xyes 1.6877186
## romantic.xyes -3.1478746
## famrel.x 0.4849984
## freetime.x .
## goout.x .
## Dalc.x .
## Walc.x .
## health.x -0.6829832
## absences.x .
# Refit the Portuguese lasso at its selected lambda; "." entries below are
# predictors shrunk exactly to zero.
best_por <- glmnet(por_dataset_input_matrix, por_dataset_output^(3/2), alpha = 1, lambda = best_cv_por_lambda)
coef(best_por)
## 41 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 30.91548264
## schoolGP 2.60465988
## schoolMS .
## sexM -3.84404037
## age 0.05200777
## addressU 2.98827188
## famsizeLE3 .
## PstatusT -0.32600670
## Medu 1.42640438
## Fedu .
## Mjobhealth .
## Mjobother -1.46996780
## Mjobservices .
## Mjobteacher .
## Fjobhealth -0.63288782
## Fjobother .
## Fjobservices .
## Fjobteacher 4.50288815
## reasonhome .
## reasonother .
## reasonreputation 1.40328894
## nurseryyes .
## internetyes .
## guardian.ymother .
## guardian.yother -1.27949103
## traveltime.y -0.58822944
## studytime.y 2.70704959
## failures.y -1.61527674
## schoolsup.yyes -8.48320440
## famsup.yyes 0.52002106
## paid.yyes -3.59084394
## activities.yyes 0.52166264
## higher.yyes 11.78847191
## romantic.yyes .
## famrel.y .
## freetime.y .
## goout.y -1.09198228
## Dalc.y .
## Walc.y -0.19765834
## health.y -0.94212194
## absences.y -0.18379960
# Coefficient extraction at the selected lambdas (these two objects are not
# used again below).
best_mat_lasso <- predict(best_mat, s = best_cv_mat_lambda, type="coefficients")
best_por_lasso <- predict(best_por, s = best_cv_por_lambda, type="coefficients")
# Predictions are on the G3^(3/2) scale, so they are back-transformed with
# ^(2/3) before computing metrics against the raw G3 outputs.
mat_dataset_pred <- predict(mat_lasso, s = best_cv_mat_lambda, newx = mat_dataset_input_matrix)
por_dataset_pred <- predict(por_lasso, s = best_cv_por_lambda, newx = por_dataset_input_matrix)
# Training-set performance.
postResample(mat_dataset_pred^(2/3),mat_dataset_output)
## RMSE Rsquared MAE
## 4.0240604 0.2673442 3.0042883
postResample(por_dataset_pred^(2/3),por_dataset_output)
## RMSE Rsquared MAE
## 2.2005416 0.4270536 1.6611030
# Held-out validation-set performance.
mat_validation_pred <- predict(mat_lasso, s = best_cv_mat_lambda, newx = mat_validation_input_matrix)
por_validation_pred <- predict(por_lasso, s = best_cv_por_lambda, newx = por_validation_input_matrix)
postResample(mat_validation_pred^(2/3),mat_validation_output)
## RMSE Rsquared MAE
## 4.7071885 0.1065235 3.4617938
postResample(por_validation_pred^(2/3),por_validation_output)
## RMSE Rsquared MAE
## 2.8858860 0.1598111 1.9985826
# Lasso regression results:
# Train  RMSE: 4.0240604 (mat), 2.2005416 (por)
#        R2:   0.2673442 (mat), 0.4270536 (por)
# Test   RMSE: 4.7071885 (mat), 2.8858860 (por)
#        R2:   0.1065235 (mat), 0.1598111 (por)
# Random forest via caret with 10-fold CV repeated 3 times, selecting on RMSE.
control <- trainControl(method="repeatedcv", number=10, repeats = 3)
metric <- "RMSE"
set.seed(1)
# mtry fixed at the sqrt(p) heuristic; note sqrt(30) = 5.477 is non-integer —
# randomForest appears to accept it as-is (the tuning output reports
# "held constant at a value of 5.477226") — consider floor()/round().
mat_rf_train <- train(G3.x~., data=mat_dataset, method="rf", importance=TRUE
,tuneGrid= expand.grid(.mtry=sqrt(ncol(mat_dataset_input))),metric=metric, trControl=control)
set.seed(1)
por_rf_train <- train(G3.y~., data=por_dataset, method="rf", importance=TRUE
,tuneGrid= expand.grid(.mtry=sqrt(ncol(por_dataset_input))),metric=metric, trControl=control)
# Cross-validated performance of the random forests on the training split.
mat_rf_train
## Random Forest
##
## 269 samples
## 30 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 243, 242, 242, 242, 242, 242, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 3.936823 0.2973943 2.996881
##
## Tuning parameter 'mtry' was held constant at a value of 5.477226
por_rf_train
## Random Forest
##
## 269 samples
## 30 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 243, 242, 243, 240, 243, 243, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 2.345072 0.3521341 1.793668
##
## Tuning parameter 'mtry' was held constant at a value of 5.477226
# NOTE(review): as with the linear models, these "validate" forests are
# trained from scratch on the held-out 30% rather than scoring the training
# forests on it — consider predict(mat_rf_train, mat_validation) instead.
set.seed(1)
mat_rf_validate <- train(G3.x~., data=mat_validation, method="rf", importance=TRUE
,tuneGrid= expand.grid(.mtry=sqrt(ncol(mat_validation_input))),metric=metric, trControl=control)
set.seed(1)
por_rf_validate <- train(G3.y~., data=por_validation, method="rf", importance=TRUE
,tuneGrid= expand.grid(.mtry=sqrt(ncol(por_validation_input))),metric=metric, trControl=control)
mat_rf_validate
## Random Forest
##
## 113 samples
## 30 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 103, 103, 102, 100, 102, 102, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.417505 0.2494508 3.419602
##
## Tuning parameter 'mtry' was held constant at a value of 5.477226
por_rf_validate
## Random Forest
##
## 113 samples
## 30 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 102, 102, 102, 102, 102, 103, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 2.76248 0.2834576 2.02071
##
## Tuning parameter 'mtry' was held constant at a value of 5.477226
# Random forest (caret) performance summary:
# Train RMSE: 3.936823 (mat)  2.345072 (por)
#       R2:   0.2973943 (mat) 0.3521341 (por)
# Test  RMSE: 4.417505 (mat)  2.76248 (por)
#       R2:   0.2494508 (mat) 0.2834576 (por)
# ---- Standalone randomForest fits (full training data, OOB error) ----
set.seed(1)
rf_test_mat <- randomForest(G3.x~ ., data = mat_dataset, mtry = sqrt(ncol(mat_dataset_input)),
importance = TRUE)
# NOTE(review): there is no set.seed() immediately before this second fit —
# it consumes the RNG state left by the first, so it is only reproducible
# when the script runs top-to-bottom. Adding set.seed(1) here would change
# the recorded output below; confirm which behaviour is intended.
rf_test_por <- randomForest(G3.y~ ., data = por_dataset, mtry = sqrt(ncol(por_dataset_input)),
importance = TRUE)
rf_test_mat
##
## Call:
## randomForest(formula = G3.x ~ ., data = mat_dataset, mtry = sqrt(ncol(mat_dataset_input)), importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 15.2736
## % Var explained: 28.18
rf_test_por
##
## Call:
## randomForest(formula = G3.y ~ ., data = por_dataset, mtry = sqrt(ncol(por_dataset_input)), importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 5.618688
## % Var explained: 31.19
# ---- Variable importance (unscaled) for both forests ----
# failures.x, absences.x, Medu and age dominate for maths; studytime.y,
# higher.y and Medu dominate for Portuguese (see outputs below).
varImp(rf_test_mat,scale = FALSE)
## Overall
## school 0.10337412
## sex 0.25837920
## age 1.28329775
## address -0.03755319
## famsize -0.01612318
## Pstatus 0.07495021
## Medu 1.33780382
## Fedu 0.51474905
## Mjob 0.22641953
## Fjob 0.18596111
## reason 0.22430389
## nursery 0.03172398
## internet 0.03299865
## guardian.x 0.30849800
## traveltime.x 0.18483856
## studytime.x -0.08773640
## failures.x 2.79308229
## schoolsup.x 0.29888883
## famsup.x 0.09591145
## paid.x 0.03084162
## activities.x 0.16483758
## higher.x 0.74799765
## romantic.x 0.20851632
## famrel.x 0.18821259
## freetime.x 0.34332828
## goout.x 0.31739163
## Dalc.x 0.49580234
## Walc.x 0.22635352
## health.x 0.29695444
## absences.x 1.87290634
varImp(rf_test_por,scale = FALSE)
## Overall
## school 0.116463590
## sex 0.338447468
## age 0.087977753
## address 0.056843522
## famsize 0.005516336
## Pstatus 0.003139285
## Medu 0.429947062
## Fedu 0.208019782
## Mjob 0.010834143
## Fjob 0.060019191
## reason 0.102027596
## nursery -0.032908549
## internet -0.022446936
## guardian.y 0.038440629
## traveltime.y 0.041510471
## studytime.y 0.683525699
## failures.y 0.342778818
## schoolsup.y 0.249717031
## famsup.y 0.059072421
## paid.y 0.006689773
## activities.y 0.103966373
## higher.y 0.558579799
## romantic.y 0.018487705
## famrel.y -0.020232550
## freetime.y 0.096659654
## goout.y 0.057727484
## Dalc.y 0.157554342
## Walc.y 0.204361949
## health.y 0.089560573
## absences.y 0.042981379
# Dotchart visualisations of the two importance measures per forest.
varImpPlot(rf_test_mat)
varImpPlot(rf_test_por)
# ---- RMSE of the standalone forests on train and validation splits ----
# NOTE(review): predicting back on the data the forest was fitted to gives
# optimistic RMSE (1.90 / 1.19 here vs. the CV estimates above); the
# validation-set RMSE values further down are the honest estimates.
y_hat_mat_train <- predict(rf_test_mat, mat_dataset)
y_hat_mat_train_scored <- as_tibble(cbind(mat_dataset, y_hat_mat_train))
y_hat_mat_train_rmse <- yardstick::rmse(y_hat_mat_train_scored, truth=mat_dataset_output, estimate=y_hat_mat_train)
y_hat_mat_train_rmse # 1.9
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 1.90
y_hat_por_train <- predict(rf_test_por, por_dataset)
y_hat_por_train_scored <- as_tibble(cbind(por_dataset, y_hat_por_train))
y_hat_por_train_rmse <- yardstick::rmse(y_hat_por_train_scored, truth=por_dataset_output, estimate=y_hat_por_train)
y_hat_por_train_rmse # 1.19
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 1.19
# Score the held-out validation splits.
y_hat_mat_test <- predict(rf_test_mat, mat_validation)
y_hat_mat_test_scored <- as_tibble(cbind(mat_validation, y_hat_mat_test))
y_hat_mat_test_rmse <- yardstick::rmse(y_hat_mat_test_scored, truth=mat_validation_output, estimate=y_hat_mat_test)
y_hat_mat_test_rmse # 4.19
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 4.19
y_hat_por_test <- predict(rf_test_por, por_validation)
y_hat_por_test_scored <- as_tibble(cbind(por_validation, y_hat_por_test))
y_hat_por_test_rmse <- yardstick::rmse(y_hat_por_test_scored, truth=por_validation_output, estimate=y_hat_por_test)
y_hat_por_test_rmse # 2.76
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 2.76
# Model comparison summaries (three models, mat / por; presumably the earlier
# regression models followed by the random forest — confirm labels):
# Train RMSE: 3.730919 (mat) 2.110835 (por); R2: 0.3454868 (mat) 0.4543227 (por)
# Test  RMSE: 3.341797 (mat) 2.00442 (por);  R2: 0.5227731 (mat) 0.5901086 (por)
#
# Train RMSE: 4.0240604 (mat) 2.2005416 (por); R2: 0.2673442 (mat) 0.4270536 (por)
# Test  RMSE: 4.7071885 (mat) 2.8858860 (por); R2: 0.1065235 (mat) 0.1598111 (por)
#
# Train RMSE: 3.936823 (mat) 2.345072 (por); R2: 0.2973943 (mat) 0.3521341 (por)
# Test  RMSE: 4.417505 (mat) 2.76248 (por);  R2: 0.2494508 (mat) 0.2834576 (por)
# ---- Bank marketing classification: load data (ROSE handles imbalance) ----
library(ROSE)
## Loaded ROSE 0.0-4
# Read the semicolon-separated bank marketing data; keep strings as factors
# so caret/ROSE treat the categorical columns correctly.
# (Fixed: use `<-` for assignment and spell out TRUE — `T` is reassignable.)
bank <- read.table("bank.csv", sep = ";", header = TRUE, stringsAsFactors = TRUE)
# Exclude the column "duration", since our intention is to build a realistic predictive model.
bank1 <- bank[,-12]
# Set "yes" as the positive class.
# Make "yes" the reference (first) level so caret reports it as the
# positive class in confusion matrices.
bank1$y <- relevel(bank1$y, ref = "yes")
levels(bank1$y)
## [1] "yes" "no"
# Create a stratified sample of 70% of the rows in the bank1 dataset.
# Stratified 70/30 split on the outcome y (indices of the 70% portion).
set.seed(1)
bank1_training_sample <- createDataPartition(bank1$y,p = 0.7, list = FALSE)
# Use 70% of the data for training and testing the models.
bank1_dataset<- bank1[bank1_training_sample,]
# Use the remaining 30% of the data to validate the model.
bank1_validation <- bank1[-bank1_training_sample,]
# Split inputs and outputs for the training and validation sets.
# Column 16 is the outcome y; columns 1-15 are the predictors.
# NOTE(review): `output` and `input` duplicate bank1_dataset_output/_input
# below and are not referenced again in this chunk — likely leftovers.
output <- bank1_dataset[16]
input <- bank1_dataset[1:15]
bank1_dataset_output <- bank1_dataset[16]
bank1_dataset_input <- bank1_dataset[1:15]
bank1_validation_output <- bank1_validation[16]
bank1_validation_input <- bank1_validation[1:15]
# Baseline logistic regression with repeated 10-fold CV, tuned on Accuracy.
control <- trainControl(method="repeatedcv", number=10, repeats = 3)
metric <- "Accuracy"
set.seed(1)
bank1_fit_glm <- train(y~., data=bank1_dataset, method="glm", metric=metric, trControl=control, family="binomial")
confusionMatrix(bank1_fit_glm)
## Cross-Validated (10 fold, repeated 3 times) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction yes no
## yes 1.9 1.1
## no 9.7 87.4
##
## Accuracy (average) : 0.8924
# Predict the validation set (drop the outcome column before predicting).
bank1_pred <- predict(bank1_fit_glm, bank1_validation[,-16])
head(data.frame(original = bank1_validation_output, pred = bank1_pred))
## y pred
## 2 no no
## 8 no no
## 21 no no
## 22 no no
## 25 no no
## 32 no no
# Class counts in the training outcome — shows the imbalance.
table(bank1_dataset_output)
## bank1_dataset_output
## yes no
## 365 2800
# From the table we can see that "yes" occurs far less often than "no". This
# class imbalance can bias the model. Since we are interested in a client's
# subscription status after the phone call, we need to revise the model,
# because the "no" (not subscribed) class dominates the "yes" (subscribed) class.
# Class proportions: ~11.5% yes vs ~88.5% no.
prop.table(table(bank1_dataset_output))
## bank1_dataset_output
## yes no
## 0.1153239 0.8846761
par(mfrow=c(1, 1))
barplot(prop.table(table(bank1_dataset_output)),
ylim = c(0, 0.9),
main = "Class Distribution")
# ---- Rebalance with ROSE: under-, over- and combined sampling ----
set.seed(1)
undersampling <- ovun.sample(y~., data=bank1_dataset, method = "under",N = 720 ,seed = 1)$data
table(undersampling$y)
##
## no yes
## 355 365
oversampling <- ovun.sample(y~., data=bank1_dataset, method = "over",N = 5600,seed = 1)$data
table(oversampling$y)
##
## no yes
## 2800 2800
bothsampling <- ovun.sample(y~., data=bank1_dataset, method = "both",N = 3165, p=.5 ,seed = 1)$data
table(bothsampling$y)
##
## no yes
## 1643 1522
# Quick decision-tree comparison of the three resampling schemes via AUC.
# NOTE(review): AUC is computed on the (unbalanced) training data the trees
# indirectly saw, so these are optimistic; a validation-set ROC would be a
# fairer comparison.
library(rpart)
set.seed(1)
tree.over <- rpart(y~., data=oversampling)
tree.under <- rpart(y~., data=undersampling)
tree.both <- rpart(y~., data=bothsampling)
pred.tree.over <- predict(tree.over, newdata = bank1_dataset)
pred.tree.under <- predict(tree.under, newdata = bank1_dataset)
pred.tree.both <- predict(tree.both, newdata = bank1_dataset)
# Column 2 of the prediction matrix holds the class probabilities used for ROC.
roc.curve(bank1_dataset$y, pred.tree.under[,2])
## Area under the curve (AUC): 0.684
roc.curve(bank1_dataset$y, pred.tree.over[,2])
## Area under the curve (AUC): 0.732
roc.curve(bank1_dataset$y, pred.tree.both[,2])
## Area under the curve (AUC): 0.756
# ---- Logistic regression refit on the combined-sampling data ----
# "both" sampling gave the best tree AUC above, so refit the glm on it and
# evaluate against the untouched validation set.
set.seed(1)
bank1_fit_glm_bothsampling <- train(y~., data=bothsampling, method="glm", metric=metric, trControl=control, family="binomial")
bank1_fit_glm_bothsampling_pred <- predict(bank1_fit_glm_bothsampling, bank1_validation)
confusionMatrix(bank1_fit_glm_bothsampling_pred, bank1_validation$y,positive="yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 98 304
## no 58 896
##
## Accuracy : 0.733
## 95% CI : (0.7086, 0.7564)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2223
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.62821
## Specificity : 0.74667
## Pos Pred Value : 0.24378
## Neg Pred Value : 0.93920
## Prevalence : 0.11504
## Detection Rate : 0.07227
## Detection Prevalence : 0.29646
## Balanced Accuracy : 0.68744
##
## 'Positive' Class : yes
##
# Coefficient-based importance for the rebalanced glm (top 20 of 41 dummies).
varImp(bank1_fit_glm_bothsampling,scale=FALSE)
## glm variable importance
##
## only 20 most important variables shown (out of 41)
##
## Overall
## poutcomesuccess 8.015
## contactunknown 7.390
## loanyes 4.506
## monthmar 4.404
## monthjan 4.342
## monthoct 4.301
## campaign 3.800
## monthnov 3.767
## maritalmarried 3.402
## jobhousemaid 3.026
## jobretired 2.986
## educationtertiary 2.523
## monthdec 2.493
## defaultyes 2.065
## `jobself-employed` 1.910
## contacttelephone 1.779
## educationunknown 1.696
## educationsecondary 1.675
## day 1.635
## monthjul 1.481
# ---- Random forest on the original (imbalanced) training data ----
# Plain 10-fold CV; caret tunes mtry over its default grid (2, 21, 41).
control1 <- trainControl(method="cv", number=10)
set.seed(1)
bank1_fit_rf <- train(y~., data=bank1_dataset, method="rf", metric=metric, trControl=control1)
bank1_fit_rf
## Random Forest
##
## 3165 samples
## 15 predictor
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2849, 2849, 2849, 2849, 2848, 2848, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8872050 0.05306595
## 21 0.8897297 0.20918673
## 41 0.8894112 0.23048059
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 21.
# summary() on a caret rf object just lists the components of the final
# randomForest fit — kept for reference.
summary(bank1_fit_rf)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 3165 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 6330 matrix numeric
## oob.times 3165 -none- numeric
## classes 2 -none- character
## importance 41 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 3165 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 41 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## param 0 -none- list
confusionMatrix(bank1_fit_rf)
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction yes no
## yes 1.9 1.4
## no 9.7 87.1
##
## Accuracy (average) : 0.8897
# Validation-set evaluation: accuracy is high but sensitivity for "yes" is
# only 0.16 — the imbalanced forest mostly predicts "no".
bank1_pred_rf <- predict(bank1_fit_rf, bank1_validation[,-16])
bank1_pred_rf %>% confusionMatrix(bank1_validation$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 25 22
## no 131 1178
##
## Accuracy : 0.8872
## 95% CI : (0.8691, 0.9035)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.4198
##
## Kappa : 0.2039
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.16026
## Specificity : 0.98167
## Pos Pred Value : 0.53191
## Neg Pred Value : 0.89992
## Prevalence : 0.11504
## Detection Rate : 0.01844
## Detection Prevalence : 0.03466
## Balanced Accuracy : 0.57096
##
## 'Positive' Class : yes
##
# ---- Random forest refit on the combined-sampling ("both") data ----
# NOTE(review): the 0.97 CV accuracy below is likely inflated — "both"
# sampling duplicates minority rows, and duplicates of the same observation
# can land in different CV folds, leaking training data into the test folds.
# The validation-set confusion matrix further down is the honest estimate.
set.seed(1)
bank1_fit_rf_bothsampling <- train(y~., data=bothsampling, method="rf", metric=metric, trControl=control1)
bank1_fit_rf_bothsampling
## Random Forest
##
## 3165 samples
## 15 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2848, 2849, 2849, 2848, 2849, 2849, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7829354 0.5621975
## 21 0.9703001 0.9406016
## 41 0.9658777 0.9317837
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 21.
varImp(bank1_fit_rf_bothsampling,scale=FALSE)
## rf variable importance
##
## only 20 most important variables shown (out of 41)
##
## Overall
## balance 287.53
## age 227.46
## day 203.47
## campaign 91.97
## poutcomesuccess 82.77
## pdays 54.51
## contactunknown 51.19
## previous 38.46
## housingyes 32.87
## maritalmarried 31.50
## jobblue-collar 28.77
## loanyes 28.38
## educationsecondary 27.65
## jobtechnician 24.06
## educationtertiary 24.02
## maritalsingle 23.59
## monthmar 23.00
## monthmay 20.39
## monthnov 19.71
## monthaug 19.30
confusionMatrix(bank1_fit_rf_bothsampling)
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction no yes
## no 49.5 0.6
## yes 2.4 47.5
##
## Accuracy (average) : 0.9703
# Validation-set evaluation: sensitivity for "yes" improves to 0.35
# (from 0.16 for the imbalanced forest) at the cost of overall accuracy.
bank1_pred_rf_bothsampling <- predict(bank1_fit_rf_bothsampling, bank1_validation[,-16])
bank1_pred_rf_bothsampling %>% confusionMatrix(bank1_validation$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 55 103
## no 101 1097
##
## Accuracy : 0.8496
## 95% CI : (0.8294, 0.8682)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.2653
##
## Mcnemar's Test P-Value : 0.9442
##
## Sensitivity : 0.35256
## Specificity : 0.91417
## Pos Pred Value : 0.34810
## Neg Pred Value : 0.91569
## Prevalence : 0.11504
## Detection Rate : 0.04056
## Detection Prevalence : 0.11652
## Balanced Accuracy : 0.63337
##
## 'Positive' Class : yes
##